if (require("PageRank")) {
library(PageRank)
}else{
devtools::install_github("ryangreenup/PageRank")
library(PageRank)
}
library(pacman)
pacman::p_load(PageRank, devtools, Matrix, igraph, mise, tidyverse, rgl, latex2exp)
# mise()
Define some constants
# Experiment 1 parameter grid ----
# NOTE: the original built a length-5 sequence for m and then immediately
# overwrote it with the scalar 5; the dead assignment has been removed.
m <- 5                                            # edges per step for sample_pa
beta <- seq(from = 1, to = 20, length.out = 10)   # power-walk beta values
sz <- seq(from = 10, to = 1000, length.out = 10)  # graph sizes (vertex counts)
# One row per (m, beta, size) combination: 1 * 10 * 10 = 100 rows.
input_var <- expand.grid("m" = m, "beta" = beta, "size" = sz)
input_var
#' Features of the power-walk transition matrix on a random BA graph.
#'
#' Samples a Barabasi-Albert preferential-attachment graph, transposes its
#' adjacency matrix (row-to-column orientation), builds the power-walk
#' transition matrix, and extracts summary features.
#'
#' @param m    Edges added at each step of the preferential attachment.
#' @param beta Parameter passed to PageRank::power_walk_prob_trans.
#' @param size Number of vertices in the sampled graph.
#' @return Numeric vector: c(|second eigenvalue|, adjacency density, trace).
random_graph <- function(m, beta, size) {
  g1 <- igraph::sample_pa(n = size, power = 3, m = m)
  A <- igraph::get.adjacency(g1) # Row to column
  A <- Matrix::t(A)
  # `trans` rather than `T`: T shadows the TRUE shorthand and invites bugs.
  trans <- PageRank::power_walk_prob_trans(A, beta = beta)
  tr <- sum(diag(trans))
  # R orders eigenvalues by descending magnitude, so [2] is the second one.
  e2 <- eigen(trans, only.values = TRUE)$values[2]
  c(abs(e2), mean(A), tr)
}
Map the function
# Evaluate random_graph over every row of the parameter grid ----
# Drop stale results from any previous run; silent when objects are absent.
suppressWarnings(rm(Y, data))
nc <- length(random_graph(1, 1, 1))  # probe call to size the result matrix
# Preallocate: one row of outputs per parameter combination.
Y <- matrix(ncol = nc, nrow = nrow(input_var))
for (i in seq_len(nrow(input_var))) {
  X <- input_var[i, ]  # one-row data frame; parameters accessed with $
  Y[i, ] <- random_graph(X$m, X$beta, X$size)
  print(i / nrow(input_var))  # progress fraction
}
[1] 0.01
[1] 0.02
[1] 0.03
[1] 0.04
[1] 0.05
[1] 0.06
[1] 0.07
[1] 0.08
[1] 0.09
[1] 0.1
[1] 0.11
[1] 0.12
[1] 0.13
[1] 0.14
[1] 0.15
[1] 0.16
[1] 0.17
[1] 0.18
[1] 0.19
[1] 0.2
[1] 0.21
[1] 0.22
[1] 0.23
[1] 0.24
[1] 0.25
[1] 0.26
[1] 0.27
[1] 0.28
[1] 0.29
[1] 0.3
[1] 0.31
[1] 0.32
[1] 0.33
[1] 0.34
[1] 0.35
[1] 0.36
[1] 0.37
[1] 0.38
[1] 0.39
[1] 0.4
[1] 0.41
[1] 0.42
[1] 0.43
[1] 0.44
[1] 0.45
[1] 0.46
[1] 0.47
[1] 0.48
[1] 0.49
[1] 0.5
[1] 0.51
[1] 0.52
[1] 0.53
[1] 0.54
[1] 0.55
[1] 0.56
[1] 0.57
[1] 0.58
[1] 0.59
[1] 0.6
[1] 0.61
[1] 0.62
[1] 0.63
[1] 0.64
[1] 0.65
[1] 0.66
[1] 0.67
[1] 0.68
[1] 0.69
[1] 0.7
[1] 0.71
[1] 0.72
[1] 0.73
[1] 0.74
[1] 0.75
[1] 0.76
[1] 0.77
[1] 0.78
[1] 0.79
[1] 0.8
[1] 0.81
[1] 0.82
[1] 0.83
[1] 0.84
[1] 0.85
[1] 0.86
[1] 0.87
[1] 0.88
[1] 0.89
[1] 0.9
[1] 0.91
[1] 0.92
[1] 0.93
[1] 0.94
[1] 0.95
[1] 0.96
[1] 0.97
[1] 0.98
[1] 0.99
[1] 1
# Store Y as a plain real matrix when no entry has an imaginary component.
# all(Im(Y) == 0) is equivalent to the original modulus comparison
# (Mod(z) == |Re(z)| iff Im(z) == 0) but states the intent directly;
# Im() returns zeros for an already-real matrix, so this is always safe.
if (all(Im(Y) == 0)) {
  Y <- Re(Y)
}
nrow(input_var)  # sanity check: one parameter combination per row
[1] 100
nrow(Y)  # must match nrow(input_var): one result row per parameter row
[1] 100
# Label the three outputs of random_graph and join them onto the inputs.
Y <- as.data.frame(Y); colnames(Y) <- c("eigenvalue2", "A_dens", "trace")
(data <- cbind(input_var, Y)) %>% head()
# Scatterplot matrix and Pearson correlations of inputs vs outputs.
# (The constant column m has zero variance, hence the warning/NAs below.)
pairs(data)
cor(data)
the standard deviation is zero
m beta size eigenvalue2
m 1 NA NA NA
beta NA 1.000000e+00 4.635990e-21 0.3091320
size NA 4.635990e-21 1.000000e+00 -0.6207504
eigenvalue2 NA 3.091320e-01 -6.207504e-01 1.0000000
A_dens NA -2.642753e-21 -5.916467e-01 0.8649706
trace NA -3.880604e-01 6.586858e-01 -0.9874588
A_dens trace
m NA NA
beta -2.642753e-21 -0.3880604
size -5.916467e-01 0.6586858
eigenvalue2 8.649706e-01 -0.9874588
A_dens 1.000000e+00 -0.8183481
trace -8.183481e-01 1.0000000
library(corrplot)
# Spearman is rank-based, so it captures monotone but non-linear relationships.
# Uses `<-` for assignment (the original used `=`, against R convention).
# NOTE: the constant column m has zero variance, hence the warning and NAs.
cormat <- cor(data, method = "spearman")
the standard deviation is zero
# Lower-triangle ellipse plot of the Spearman correlation matrix.
corrplot(cormat, method = "ellipse", type = "lower")
names(data)  # columns available for plotting
[1] "m" "beta" "size" "eigenvalue2"
[5] "A_dens" "trace"
Let’s look at a 3d Output
# NOTE(review): echo of a `fig` built earlier with mismatched column lengths;
# the tibble-size error below is its output. The working figure is rebuilt
# further down.
fig
Error: Tibble columns must have compatible sizes.
* Size 40: Column `x`.
* Size 100: Columns `y` and `z`.
ℹ Only values of size one are recycled.
names(data)  # re-check available columns before rebuilding the plot
[1] "m" "beta" "size" "eigenvalue2"
[5] "A_dens" "trace"
library(plotly)
# d <- data[sample(1:nrow(data), 1000),]
d <- data
# 3-D scatter of density vs trace vs second eigenvalue, built as one pipeline
# instead of repeated reassignment of `fig`.
fig <- plot_ly(d, x = ~A_dens, y = ~trace, z = ~eigenvalue2) %>%
  add_markers(size = 1) %>%
  layout(scene = list(xaxis = list(title = 'Density'),
                      yaxis = list(title = 'trace'),
                      zaxis = list(title = 'E2')))
fig
NA
Clearly I should be able to model this.
Let’s look at density; it’s continuous, so I should be able to take slices of A_dens.
A_dens is dependent on p, so I’ll just use different ps
# Experiment 2 parameter grid: two m values, finer beta range, larger graphs.
m <- seq(from = 1, to = 9, length.out = 2)
beta <- seq(from = 1, to = 6, length.out = 10)
# rev() in base R (no magrittr pipe needed): big graphs first, so slow runs
# surface early in the loop below.
sz <- rev(seq(from = 100, to = 1000, length.out = 4))
# One row per (m, beta, size) combination: 2 * 10 * 4 = 80 rows.
input_var <- expand.grid("m" = m, "beta" = beta, "size" = sz)
input_var
# Evaluate random_graph over the second parameter grid ----
# Drop stale results from the first experiment; silent when objects are absent.
suppressWarnings(rm(Y, data))
nc <- length(random_graph(1, 1, 1))  # probe call to size the result matrix
Y <- matrix(ncol = nc, nrow = nrow(input_var))  # preallocate output rows
for (i in seq_len(nrow(input_var))) {
  X <- input_var[i, ]  # one-row data frame; parameters accessed with $
  Y[i, ] <- random_graph(X$m, X$beta, X$size)
  print(i / nrow(input_var))  # progress fraction
}
[1] 0.0125
[1] 0.025
[1] 0.0375
[1] 0.05
[1] 0.0625
[1] 0.075
[1] 0.0875
[1] 0.1
[1] 0.1125
[1] 0.125
[1] 0.1375
[1] 0.15
[1] 0.1625
[1] 0.175
[1] 0.1875
[1] 0.2
[1] 0.2125
[1] 0.225
[1] 0.2375
[1] 0.25
[1] 0.2625
[1] 0.275
[1] 0.2875
[1] 0.3
[1] 0.3125
[1] 0.325
[1] 0.3375
[1] 0.35
[1] 0.3625
[1] 0.375
[1] 0.3875
[1] 0.4
[1] 0.4125
[1] 0.425
[1] 0.4375
[1] 0.45
[1] 0.4625
[1] 0.475
[1] 0.4875
[1] 0.5
[1] 0.5125
[1] 0.525
[1] 0.5375
[1] 0.55
[1] 0.5625
[1] 0.575
[1] 0.5875
[1] 0.6
[1] 0.6125
[1] 0.625
[1] 0.6375
[1] 0.65
[1] 0.6625
[1] 0.675
[1] 0.6875
[1] 0.7
[1] 0.7125
[1] 0.725
[1] 0.7375
[1] 0.75
[1] 0.7625
[1] 0.775
[1] 0.7875
[1] 0.8
[1] 0.8125
[1] 0.825
[1] 0.8375
[1] 0.85
[1] 0.8625
[1] 0.875
[1] 0.8875
[1] 0.9
[1] 0.9125
[1] 0.925
[1] 0.9375
[1] 0.95
[1] 0.9625
[1] 0.975
[1] 0.9875
[1] 1
# Store Y as a plain real matrix when no entry has an imaginary component.
# all(Im(Y) == 0) is equivalent to the original modulus comparison
# (Mod(z) == |Re(z)| iff Im(z) == 0) but states the intent directly.
if (all(Im(Y) == 0)) {
  Y <- Re(Y)
}
nrow(input_var)  # sanity check: 80 parameter combinations this time
[1] 80
nrow(Y)  # must match nrow(input_var)
[1] 80
# Label the outputs, join with the inputs, and derive p = m/size.
Y <- as.data.frame(Y); colnames(Y) <- c("eigenvalue2", "A_dens", "trace")
(data2 <- cbind(input_var, Y)) %>% head()
# save(data2, file = "/home/ryan/Sync/Studies/2020Spring/DiscProj/data2_ba.rdata")
# load(file = "/home/ryan/Sync/Studies/2020Spring/DiscProj/data2_ba.rdata")
# p is used below as a link-density proxy (edges added per step over n).
data2$p <- (data2$m/data2$size)
names(data2)  # confirm the derived p column is present
[1] "m" "beta" "size" "eigenvalue2"
[5] "A_dens" "trace" "p"
# Second eigenvalue against beta, coloured by the m/n link density,
# with a per-density linear smooth overlaid on the raw points.
ggplot(data2, aes(x = beta, y = eigenvalue2, col = factor(p))) +
  geom_point(size = 0.5) +
  stat_smooth(method = "lm") +
  scale_size_continuous(range = c(0.1, 1)) +
  labs(x = "Beta",
       y = TeX("Second Eigenvalue"),
       title = TeX("Second Eigenvalue given Matrix Density")) +
  guides(col = guide_legend("Link Density (by m/n)")) +
  theme_bw()
It appears that m/n = density is its own feature; it does not affect E2 linearly, so let’s hold beta constant and see how it affects E2.
If for instance m/n increases, say quadratically for a constant beta, then I should try and fit a linear model to (m/n)^2*beta
So what about when I vary m? I.e., let’s map that to colour now:
Even though it’s hard to capture the behaviour of m/n, the size of the graph and the average outdegree are going to be constants and so this relationship should persist for graphs where m<<n.
Actually, if they are all linearly related, maybe I could use a multiple linear regression?
Clearly non-normal residuals but +-5% is pretty good for a very complicated result.
I think I already looked at this, but this again shows the really clear, near-linear relationship.
Obviously we can’t know m, but we can know the average out-degree; so let’s take a graph, measure the average out-degree, apply our model, and measure its performance.